Setup
n<-nrow(subset_selected)
set.seed(123)
n_s = sample(n,10000)
train = subset_selected[n_s, ]
crossValidationCont = function(df,K,a,nperfmeas=6)
{ set.seed(123)
n = nrow(df)
nhold = round(n/K) # size of holdout set
iperm = sample(n)
perfmeas = matrix(0,K,nperfmeas)
models = vector(mode="list", length=3)
year = vector(mode="list", length=3)
for(k in 1:K)
{ indices = (((k-1)*nhold+1):(k*nhold))
if( k==K ) indices = (((k-1)*nhold+1):n)
indices = iperm[indices]
traine = df[-indices,]
holdoute = df[indices,]
models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
RSS = c(crossprod(models[[k]]$residuals))
MSE = RSS / length(models[[k]]$residuals)
x = intervalScore(pred, holdoute$price, a)
y= sqrt(MSE)
z = summary(models[[k]])$r.squared
perfmeas[k,1:4] = x$summary
perfmeas[k,5] = y
perfmeas[k,6] = z
year[[k]] = traine$year
}
avgperfmeas = apply(perfmeas,2,mean)
list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year)
}
crossValidationCont2 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
n = nrow(df)
nhold = round(n/K) # size of holdout set
iperm = sample(n)
perfmeas = matrix(0,K,nperfmeas)
models = vector(mode="list", length=3)
year = vector(mode="list", length=3)
predi = vector(mode="list", length=3)
pri = vector(mode="list", length=3)
for(k in 1:K)
{ indices = (((k-1)*nhold+1):(k*nhold))
if( k==K ) indices = (((k-1)*nhold+1):n)
indices = iperm[indices]
traine = df[-indices,]
holdoute = df[indices,]
models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine, weights = 1/(traine$year)^2)
pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a, weights = 1/(holdoute$year)^2)
RSS = c(crossprod(models[[k]]$residuals))
MSE = RSS / length(models[[k]]$residuals)
x = intervalScore(pred, holdoute$price, a)
y= sqrt(MSE)
z = summary(models[[k]])$r.squared
perfmeas[k,1:4] = x$summary
perfmeas[k,5] = y
perfmeas[k,6] = z
year[[k]] = traine$year
predi[[k]] = pred
pri[[k]] = holdoute$price
}
avgperfmeas = apply(perfmeas,2,mean)
list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
crossValidationCont3 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
n = nrow(df)
nhold = round(n/K) # size of holdout set
iperm = sample(n)
perfmeas = matrix(0,K,nperfmeas)
models = vector(mode="list", length=3)
year = vector(mode="list", length=3)
predi = vector(mode="list", length=3)
pri = vector(mode="list", length=3)
for(k in 1:K)
{ indices = (((k-1)*nhold+1):(k*nhold))
if( k==K ) indices = (((k-1)*nhold+1):n)
indices = iperm[indices]
traine = df[-indices,]
holdoute = df[indices,]
models[[k]] = lm(I(log(price))~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
pred1 = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
pred = exp(pred1)
RSS = c(crossprod(models[[k]]$residuals))
MSE = RSS / length(models[[k]]$residuals)
x = intervalScore(pred, holdoute$price, a)
y= sqrt(MSE)
z = summary(models[[k]])$r.squared
perfmeas[k,1:4] = x$summary
perfmeas[k,5] = y
perfmeas[k,6] = z
year[[k]] = traine$year
predi[[k]] = pred
pri[[k]] = holdoute$price
}
avgperfmeas = apply(perfmeas,2,mean)
list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
intervalScore = function(predObj,actual,level) {
n = nrow(predObj)
alpha = 1-level
ilow = (actual<predObj[,2]) # overestimation
ihigh = (actual>predObj[,3]) # underestimation
sumlength = sum(predObj[,3]-predObj[,2]) # sum of lengths of prediction intervals
sumlow = sum(predObj[ilow,2]-actual[ilow])*2/alpha
sumhigh = sum(actual[ihigh]-predObj[ihigh,3])*2/alpha
avglength = sumlength/n
IS = (sumlength+sumlow+sumhigh)/n # average length + average under/over penalties
cover = mean(actual>= predObj[,2] & actual<=predObj[,3])
summ = c(level,avglength,IS,cover)
# summary with level, average length, interval score, coverage rate, r^2, rmse
imiss = which(ilow | ihigh)
list(summary=summ, imiss=imiss)
}
ols_step_best_subset(lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type+gdpc,data=train))
Best Subsets Regression
--------------------------------------------------------------------------
Model Index Predictors
--------------------------------------------------------------------------
1 year
2 year car_type
3 year mileage_cat car_type
4 year mileage_cat fuel car_type
5 mark_cat year mileage_cat fuel car_type
6 mark_cat year mileage_cat log_vol_engine fuel car_type
7 mark_cat year mileage_cat log_vol_engine fuel car_type gdpc
--------------------------------------------------------------------------
Subsets Regression Summary
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Adj. Pred
Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
--------------------------------------------------------------------------------------------------------------------------------------------------------------
1 0.4331 0.4330 0.4327 11826.1632 213461.9378 185080.4132 213483.5688 1.091154e+12 109137238.4019 10914.8160 0.5672
2 0.6354 0.6352 0.6348 4041.5913 209056.6401 180669.9789 209107.1124 701879354586.3881 70230066.4790 7023.7099 0.3649
3 0.7074 0.7071 0.7067 1268.9664 206864.1012 178470.4096 206950.6253 563184727342.0479 56380481.1251 5638.6131 0.2928
4 0.7245 0.7241 0.7235 614.7308 206269.6787 177872.3313 206377.8338 530418063646.8668 53116154.1273 5312.1482 0.2758
5 0.7337 0.7333 0.7327 261.4335 205934.9011 177533.8372 206064.6872 512698419504.8883 51357128.5591 5136.2286 0.2666
6 0.7407 0.7402 0.7393 -3.6592 205672.8528 177272.1069 205809.8493 499387849825.0785 50028809.8706 5003.3841 0.2597
7 0.7407 0.7402 0.7393 -3.0000 205673.5096 177272.7693 205817.7164 499370749041.4341 50032100.9365 5003.7141 0.2597
--------------------------------------------------------------------------------------------------------------------------------------------------------------
AIC: Akaike Information Criteria
SBIC: Sawa's Bayesian Information Criteria
SBC: Schwarz Bayesian Criteria
MSEP: Estimated error of prediction, assuming multivariate normality
FPE: Final Prediction Error
HSP: Hocking's Sp
APC: Amemiya Prediction Criteria
reg_model= crossValidationCont(train,3,0.5,6)
weighted_model = crossValidationCont2(train,3,0.5,6)
transformed_model = crossValidationCont3(train,3,0.8,6)
d = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train)
plot(d$fitted.values,d$residuals, ylab="Residuals", xlab = "Fitted Values", main = "OLS Residuals")

plot(train$year,d$residuals, ylab="Residuals", xlab = "Year",main = "Year against OLS Residuals")

de = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train, weights = 1/(train$year)^2)
plot(de$fitted.values,de$residuals, ylab="Residuals", xlab = "Fitted Values",main = "WLS Residuals")

plot(reg_model$models[[1]]$fitted.values,reg_model$models[[1]]$residuals)

plot(reg_model$models[[2]]$fitted.values,reg_model$models[[2]]$residuals)

plot(reg_model$models[[3]]$fitted.values,reg_model$models[[3]]$residuals)

plot(reg_model$year[[1]],reg_model$models[[1]]$residuals)

plot(reg_model$year[[2]],reg_model$models[[2]]$residuals)

plot(reg_model$year[[3]],reg_model$models[[3]]$residuals)

plot(weighted_model$models[[1]]$fitted.values,weighted_model$models[[1]]$residuals)

plot(weighted_model$models[[2]]$fitted.values,weighted_model$models[[2]]$residuals)

plot(weighted_model$models[[3]]$fitted.values,weighted_model$models[[3]]$residuals)

plot(transformed_model$models[[1]]$fitted.values,transformed_model$models[[1]]$residuals, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")

plot(transformed_model$models[[2]]$fitted.values,transformed_model$models[[2]]$residuals, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")

plot(transformed_model$models[[3]]$fitted.values,transformed_model$models[[3]]$residuals, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")

res1 = transformed_model$price[[1]] - exp(transformed_model$models[[1]]$fitted.values)
longer object length is not a multiple of shorter object length
res2 = transformed_model$price[[2]] - exp(transformed_model$models[[2]]$fitted.values)
longer object length is not a multiple of shorter object length
res3 = transformed_model$price[[3]] - exp(transformed_model$models[[3]]$fitted.values)
longer object length is not a multiple of shorter object length
plot(exp(transformed_model$models[[1]]$fitted.values),res1, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")

plot(exp(transformed_model$models[[2]]$fitted.values),res2, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")

plot(exp(transformed_model$models[[3]]$fitted.values),res3, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")

---
title: "R Notebook"
output: html_notebook
---
# Setup
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(mlbench)
library(randomForest)
library(party)
library(rpart)
library(rpart.plot)
library(leaps)
library(olsrr)
library(magrittr)
library(tibble)
```

```{r}
n<-nrow(subset_selected)
set.seed(123)
n_s = sample(n,10000)
train = subset_selected[n_s, ]
```


```{r}
crossValidationCont = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
    pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year)
}
```

```{r}
crossValidationCont2 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  predi = vector(mode="list", length=3)
  pri = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine, weights = 1/(traine$year)^2)
    pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a, weights = 1/(holdoute$year)^2)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
    predi[[k]] = pred
    pri[[k]] = holdoute$price
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
```

```{r}
crossValidationCont3 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  predi = vector(mode="list", length=3)
  pri = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(I(log(price))~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
    pred1 = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
    pred = exp(pred1)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
    predi[[k]] = pred
    pri[[k]] = holdoute$price
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
```

```{r}
intervalScore = function(predObj,actual,level) { 
n = nrow(predObj)
alpha = 1-level
ilow = (actual<predObj[,2]) # overestimation
ihigh = (actual>predObj[,3]) # underestimation
sumlength = sum(predObj[,3]-predObj[,2]) # sum of lengths of prediction intervals 
sumlow = sum(predObj[ilow,2]-actual[ilow])*2/alpha
sumhigh = sum(actual[ihigh]-predObj[ihigh,3])*2/alpha
avglength = sumlength/n
IS = (sumlength+sumlow+sumhigh)/n # average length + average under/over penalties 
cover = mean(actual>= predObj[,2] & actual<=predObj[,3])
summ = c(level,avglength,IS,cover)
# summary with level, average length, interval score, coverage rate, r^2, rmse
imiss = which(ilow | ihigh)
list(summary=summ, imiss=imiss)
}
ols_step_best_subset(lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type+gdpc,data=train))
reg_model= crossValidationCont(train,3,0.5,6)
weighted_model = crossValidationCont2(train,3,0.5,6)
transformed_model = crossValidationCont3(train,3,0.8,6)
```

```{r}
d = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train)
plot(d$fitted.values,d$residuals, ylab="Residuals", xlab = "Fitted Values", main = "OLS Residuals")
plot(train$year,d$residuals, ylab="Residuals", xlab = "Year",main = "Year against OLS Residuals")
de = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train, weights = 1/(train$year)^2)
plot(de$fitted.values,de$residuals, ylab="Residuals", xlab = "Fitted Values",main = "WLS Residuals")
```


```{r}
plot(reg_model$models[[1]]$fitted.values,reg_model$models[[1]]$residuals)
plot(reg_model$models[[2]]$fitted.values,reg_model$models[[2]]$residuals)
plot(reg_model$models[[3]]$fitted.values,reg_model$models[[3]]$residuals)
```

```{r}
plot(reg_model$year[[1]],reg_model$models[[1]]$residuals)
plot(reg_model$year[[2]],reg_model$models[[2]]$residuals)
plot(reg_model$year[[3]],reg_model$models[[3]]$residuals)
```

```{r}
plot(weighted_model$models[[1]]$fitted.values,weighted_model$models[[1]]$residuals)
plot(weighted_model$models[[2]]$fitted.values,weighted_model$models[[2]]$residuals)
plot(weighted_model$models[[3]]$fitted.values,weighted_model$models[[3]]$residuals)
```
```{r}
plot(transformed_model$models[[1]]$fitted.values,transformed_model$models[[1]]$residuals, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")
plot(transformed_model$models[[2]]$fitted.values,transformed_model$models[[2]]$residuals, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")
plot(transformed_model$models[[3]]$fitted.values,transformed_model$models[[3]]$residuals, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")
```

```{r}
res1 = transformed_model$price[[1]]  - exp(transformed_model$models[[1]]$fitted.values)
res2 = transformed_model$price[[2]]  - exp(transformed_model$models[[2]]$fitted.values)
res3 = transformed_model$price[[3]]  - exp(transformed_model$models[[3]]$fitted.values)
plot(exp(transformed_model$models[[1]]$fitted.values),res1, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")
plot(exp(transformed_model$models[[2]]$fitted.values),res2, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")
plot(exp(transformed_model$models[[3]]$fitted.values),res3, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")
```